// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#pragma once

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/set_operations.h>
#include <thrust/sort.h>

#include "nvbench_helper.cuh"

template <typename T, typename OpT>
static void basic(nvbench::state& state, nvbench::type_list<T>, OpT op)
{
  const auto elements       = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto size_ratio     = static_cast<std::size_t>(state.get_int64("SizeRatio"));
  const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

  const auto elements_in_A = static_cast<std::size_t>(static_cast<double>(size_ratio * elements) / 100.0f);

  thrust::device_vector<T> input = generate(elements, entropy);
  thrust::device_vector<T> output(elements);

  thrust::sort(input.begin(), input.begin() + elements_in_A);
  thrust::sort(input.begin() + elements_in_A, input.end());

  caching_allocator_t alloc;
  // not a warm-up run, we need to run once to determine the size of the output
  const auto result_ends =
    op(policy(alloc),
       input.cbegin(),
       input.cbegin() + elements_in_A,
       input.cbegin() + elements_in_A,
       input.cend(),
       output.begin());
  const std::size_t elements_in_AB = ::cuda::std::distance(output.begin(), result_ends);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(elements_in_AB);

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               op(policy(alloc, launch),
                  input.cbegin(),
                  input.cbegin() + elements_in_A,
                  input.cbegin() + elements_in_A,
                  input.cend(),
                  output.begin());
             });
}

using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t>;
